In [1]:
import glob
import os
import shutil

import numpy as np
import pandas as pd
import scipy

import rbcde
import scanpy as sc
import scrublet as scr
from anndata import AnnData

sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# all figures saved via sc.pl go into this folder
sc.settings.figdir = './figures-N2-joint-manifold/'
# record package versions for reproducibility
sc.logging.print_versions()
/usr/local/lib/python3.6/dist-packages/dask/config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
  data = yaml.load(f.read()) or {}
scanpy==1.4.5.post2 anndata==0.6.22.post1 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==0.25.3 scikit-learn==0.22.1 statsmodels==0.11.0rc2 python-igraph==0.7.1 louvain==0.6.1
In [2]:
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

Create fresh object from the embedded raw data, filter to cells that aren't doublets. Recreate raw, re-filter genes and recompute mitochondrial percentage. The cells are already filtered to exclude ones with fewer than 500 genes and more than 20% mitochondrial percentage from last time around. Store post-gene-minimum raw data in .raw. Transform the data to log(CPM/100 + 1) and keep that in .X.

In [3]:
# rebuild a fresh AnnData from the embedded raw counts of the doublet-flagged object
adata = sc.read('endometrium-N1-doublet-removal.h5ad')
adata = AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)
# keep only cells NOT flagged as doublets (the propagated doublet call)
adata = adata[[not i for i in adata.obs['is_doublet_propagate']]]
# NOTE(review): .raw is captured before the min_cells gene filter below, so it holds
# pre-gene-filter counts — the markdown above says post-gene-minimum; confirm intent
adata.raw = adata.copy()

sc.pp.filter_genes(adata, min_cells=3)
#convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
mito_genes = [name for name in adata.var_names if name.lower().startswith('mt-')]
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# normalise each cell to 10,000 counts (CPM/100), then log1p -> .X holds log(CPM/100 + 1)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
filtered out 8140 genes that are detected in less than 3 cells
normalizing by total count per cell
    finished (0:00:04): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)

The rest of the analysis will be analogous to the one that was done to obtain the preliminary manifold with the doublets still in it.

Remove cell cycle genes by transposing the object and clustering the gene space, reporting back the cluster with known cycling genes as members.

In [4]:
# work on a copy so the HVG selection / scaling / PCA for gene-space clustering
# don't touch the main object
bdata = adata.copy()
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bdata = bdata[:, bdata.var['highly_variable']]
# transpose: genes become the observations, so clustering groups co-expressed genes
bdata = bdata.copy().T
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
sc.pl.pca_variance_ratio(bdata, log=True, save='_ccg_identification.pdf')
extracting highly variable genes
    finished (0:00:05)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
computing PCA with n_comps = 50
    finished (0:00:06)
WARNING: saving figure to file figures-N2-joint-manifold/pca_variance_ratio_ccg_identification.pdf
In [5]:
# number of PCs to build the gene-space neighbour graph on
num_pcs = 20

sc.pp.neighbors(bdata,n_pcs=num_pcs)
sc.tl.umap(bdata)
sc.tl.leiden(bdata, resolution=0.4)
# well-established proliferation/cell-cycle marker genes; flag them so the
# cluster they fall into can be identified as the cycling-gene cluster
cycling_markers = ['CDK1','MKI67','CCNB2','PCNA']
bdata.obs['known_cyclers'] = [i in cycling_markers for i in bdata.obs_names]
sc.pl.umap(bdata,color=['leiden','known_cyclers'],color_map='OrRd',save='_ccg_identification.pdf')

# report which cluster each known cycler landed in; .reindex() tolerates genes that
# were dropped by the HVG filter (e.g. PCNA), whereas .loc with a list of labels and
# any label missing raises a KeyError in newer pandas (the FutureWarning seen here)
print(bdata.obs['leiden'].reindex(cycling_markers))
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:07)
running Leiden clustering
    finished: found 9 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
WARNING: saving figure to file figures-N2-joint-manifold/umap_ccg_identification.pdf
index
CDK1       8
MKI67      8
CCNB2      8
PCNA     NaN
Name: leiden, dtype: category
Categories (9, object): [0, 1, 2, 3, ..., 5, 6, 7, 8]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexing.py:961: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]

With the cycling genes identified, clean up the plotting folder by moving all related plots to a subfolder.

In [6]:
def MovePlots(plotpattern, subplotdir, figdir=None):
    """Move all top-level plot files whose name contains `plotpattern` into a subfolder.

    Parameters
    ----------
    plotpattern : str
        Substring matched against file names in the figure folder (files matching
        ``*plotpattern*.*`` are moved).
    subplotdir : str
        Name of the subfolder (created if absent) to move the matching files into.
    figdir : str, optional
        Figure folder to operate on; defaults to ``sc.settings.figdir``.
    """
    if figdir is None:
        figdir = str(sc.settings.figdir)
    # original used os.path.exists on figdir+subplotdir (no separator) but created
    # figdir+'/'+subplotdir — os.path.join + exist_ok fixes that inconsistency
    target = os.path.join(figdir, subplotdir)
    os.makedirs(target, exist_ok=True)
    # glob + shutil.move instead of os.system('mv ...'): portable, no shell involved
    for path in glob.glob(os.path.join(figdir, '*' + plotpattern + '*.*')):
        shutil.move(path, os.path.join(target, os.path.basename(path)))

MovePlots('ccg_identification','ccg_identification')

Flag cell cycling genes and continue standard pipeline in a helper object until PCA.

In [7]:
# flag every gene that fell into the cycling-gene cluster ('8' in the gene-space leiden)
adata.var['is_ccg'] = [i in bdata.obs[bdata.obs['leiden']=='8'].index for i in adata.var_names]
#compute HVG and PCA on separate helper object and port the results back to the original one
bdata = adata[:, ~adata.var['is_ccg']]
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
for col in ['highly_variable','means', 'dispersions', 'dispersions_norm']:
    adata.var[col] = bdata.var[col]
#fill NaNs with False so that subsetting to HVGs is possible
# (the cycling genes were absent from bdata, so their entries come back NaN)
adata.var['highly_variable'].fillna(value=False, inplace=True)
bdata = bdata[:, bdata.var['highly_variable']]
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack', n_comps=50)
# cell embeddings and variance info transfer directly...
adata.obsm['X_pca'] = bdata.obsm['X_pca'].copy()
adata.uns['pca'] = bdata.uns['pca'].copy()
# ...while gene loadings need zero-padding for the genes excluded from the helper object
adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable']] = bdata.varm['PCs']
sc.pl.pca_variance_ratio(adata, log=True, save='.pdf')
extracting highly variable genes
    finished (0:00:04)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
Trying to set attribute `.var` of view, making a copy.
/usr/local/lib/python3.6/dist-packages/scanpy/preprocessing/_simple.py:909: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)
computing PCA with n_comps = 50
    on highly variable genes
    finished (0:00:10)
WARNING: saving figure to file figures-N2-joint-manifold/pca_variance_ratio.pdf

Correct cross-individual batch effects with Harmony.

In [8]:
# number of principal components to hand to Harmony
num_pcs = 20

# Harmony operates on the PCA embedding; the batch key is the individual of origin
pca = adata.obsm['X_pca'][:, :num_pcs]
batch = adata.obs['individual']
In [9]:
%load_ext rpy2.ipython
In [10]:
%%R -i pca -i batch -o hem

library(harmony)
library(magrittr)

#this gets "90% of the way" to determinism. better than nothing.
set.seed(1)

# do_pca=FALSE since the input is already a PCA embedding; theta is Harmony's
# diversity penalty (see Harmony docs) — higher values push stronger batch mixing
hem = HarmonyMatrix(pca, batch, theta=4, verbose=FALSE, do_pca=FALSE)
# wrap in a data.frame so rpy2 exports it back to Python cleanly
hem = data.frame(hem)
R[write to console]: Loading required package: Rcpp

Compute clustering and UMAP, plot metadata, clustering and individual location UMAPs.

In [11]:
# attach the Harmony-corrected embedding and build the graph / clustering / UMAP on it
adata.obsm['X_harmony'] = hem.values
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata, resolution = 0.5)
sc.tl.umap(adata)

# external per-sample metadata; 'sample' itself is an existing .obs column
meta = pd.read_csv('meta.csv',index_col=0)
plotmeta = list(meta.columns)
plotmeta.append('sample')

sc.pl.umap(adata, color=plotmeta, save='.pdf')
sc.pl.umap(adata, color='leiden', save='_clustering.pdf')
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_clustering_clusnumbers.pdf')
# highlight each location separately — presumably EN=endometrium, MY=myometrium (confirm)
sc.pl.umap(adata, color='location', groups=['EN'], save='_location_EN.pdf')
sc.pl.umap(adata, color='location', groups=['MY'], save='_location_MY.pdf')
computing neighbors
/usr/local/lib/python3.6/dist-packages/numba/typed_passes.py:293: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
/usr/local/lib/python3.6/dist-packages/umap/nndescent.py:92: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
/usr/local/lib/python3.6/dist-packages/numba/typed_passes.py:293: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../usr/local/lib/python3.6/dist-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
    finished: added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix (0:00:18)
running Leiden clustering
    finished: found 13 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:01:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:19)
WARNING: saving figure to file figures-N2-joint-manifold/umap.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap_clustering.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap_clustering_clusnumbers.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap_location_EN.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap_location_MY.pdf

Plot canonical markers for ease of annotation.

In [12]:
# read the canonical marker list (one gene symbol per line), de-duplicating as we go
with open('markers.txt','r') as fid:
    markers = np.unique([line.rstrip() for line in fid])

#make sure they're in the dataset, and sort them alphabetically for ease of finding things
markers = sorted(gene for gene in markers if gene in adata.var_names)

# plot the markers four per figure, numbering files -markers1.pdf, -markers2.pdf, ...
for start in range(0, len(markers), 4):
    panel = markers[start:start + 4]
    fig_number = start // 4 + 1
    sc.pl.umap(adata, color=panel, use_raw=False,
               save='-markers' + str(fig_number) + '.pdf', color_map='OrRd')

#goodbye clutter!
MovePlots('markers','markers')
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers1.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers2.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers3.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers4.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers5.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers6.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers7.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers8.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers9.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers10.pdf
WARNING: saving figure to file figures-N2-joint-manifold/umap-markers11.pdf

Identify cluster markers via the rank-biserial correlation, an effect size equivalent of the Wilcoxon test. Threshold the coefficient with the literature critical value for a large effect (0.5).

In [13]:
# rank-biserial correlation per gene per cluster; adds 'RBC_' columns to .var
rbcde.RBC(adata, use_raw=False)
# keep genes clearing the large-effect threshold (0.5) as cluster markers
degs, plot_dict = rbcde.filter_markers(adata, use_raw=False, thresh=0.5)
computing rank-biserial correlation
	finished: added
	'RBC_' columns for each of the clusters to .var (0:02:05)
--> 62 markers found for cluster 0
--> 27 markers found for cluster 1
--> 103 markers found for cluster 10
--> 314 markers found for cluster 11
--> 138 markers found for cluster 12
--> 98 markers found for cluster 2
--> 61 markers found for cluster 3
--> 6 markers found for cluster 4
--> 0 markers found for cluster 5
--> 59 markers found for cluster 6
--> 33 markers found for cluster 7
--> 27 markers found for cluster 8
--> 39 markers found for cluster 9

It would appear that a cluster has no large effect markers. Recompute with the literature medium effect size threshold (0.3).

In [14]:
degs_med, plot_dict_med = rbcde.filter_markers(adata, use_raw=False, thresh=0.3)
--> 270 markers found for cluster 0
--> 79 markers found for cluster 1
--> 317 markers found for cluster 10
--> 1038 markers found for cluster 11
--> 480 markers found for cluster 12
--> 290 markers found for cluster 2
--> 175 markers found for cluster 3
--> 49 markers found for cluster 4
--> 5 markers found for cluster 5
--> 244 markers found for cluster 6
--> 92 markers found for cluster 7
--> 110 markers found for cluster 8
--> 151 markers found for cluster 9

Okay, everything has markers now. Take large effect size for clusters that aren't 4 and 5, and report medium for those two.

In [15]:
# clusters that get the relaxed (medium effect, 0.3) marker threshold
lenient = ['4','5']
# keep a medium-threshold row only if its cluster is lenient, or it clears 0.5 anyway
degs = degs_med.loc[~np.array([i not in lenient and j<0.5 for i,j in zip(degs_med['cluster'], degs_med['RBC'])]),:]
degs.to_csv(str(sc.settings.figdir)+'/cluster_markers_rbc.csv')
# swap in the medium-threshold gene lists for the lenient clusters' plots
for clus in lenient:
    plot_dict[clus] = plot_dict_med[clus].copy()

# undo the log1p so fold changes are computed on the per-10k-normalised values
adata_count = AnnData(np.expm1(adata.X), var=adata.var, obs=adata.obs)
for clus in plot_dict:
    degs_clus = degs.loc[degs['cluster']==clus,:].copy()
    degs_clus['log2_FC'] = 0
    degs_clus['percent_cluster'] = 0
    degs_clus['percent_rest'] = 0
    # split cells into the cluster of interest vs everything else,
    # then restrict both to this cluster's marker genes
    adata_clus = adata_count[adata.obs['leiden']==clus]
    adata_rest = adata_count[[not i for i in adata.obs['leiden']==clus]]
    adata_clus = adata_clus[:, degs_clus.index]
    adata_rest = adata_rest[:, degs_clus.index]
    degs_clus['log2_FC'] = np.asarray(np.log2(np.mean(adata_clus.X,axis=0)/np.mean(adata_rest.X,axis=0))).reshape(-1)
    #are you expressed? binarise the sparse matrices' stored values in place
    adata_clus.X.data = adata_clus.X.data > 0
    adata_rest.X.data = adata_rest.X.data > 0
    # fraction of cells expressing each marker, in-cluster and out-of-cluster
    degs_clus['percent_cluster'] = np.asarray(100*np.sum(adata_clus.X,axis=0)/adata_clus.shape[0]).reshape(-1)
    degs_clus['percent_rest'] = np.asarray(100*np.sum(adata_rest.X,axis=0)/adata_rest.shape[0]).reshape(-1)
    degs_clus.to_csv(str(sc.settings.figdir)+'/cluster_markers_'+clus+'.csv')
    # cap the dot plot at the top 100 markers per cluster to keep it legible
    sc.pl.dotplot(adata, {clus: plot_dict[clus][:100]}, groupby='leiden', use_raw=False, save='_cluster_markers_'+clus+'.pdf')

#goodbye clutter!
MovePlots('cluster_markers','cluster_markers')
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_0.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_1.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_10.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_11.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_12.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_2.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_3.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_4.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_5.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_6.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_7.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_8.pdf
WARNING: saving figure to file figures-N2-joint-manifold/dotplot_cluster_markers_9.pdf

And with that, we're done. Save the object for later use.

In [16]:
adata.write('endometrium-N2-10x.h5ad')